#!/bin/sh
# affix compressor utility for Hunspell
# 2008 (c) László Németh, version 0.3
# usage: affixcompress sorted_word_list_file [max_affix_rules]
# Called without arguments: print the help text on stdout and exit successfully.
if [ "$#" -eq 0 ]; then
    cat <<'EOF'
affixcompress - compress a huge sorted word list to Hunspell format
Usage: 

LC_ALL=C sort word_list >sorted_word_list
affixcompress sorted_word_list [max_affix_rules]

Default value of max_affix_rules = 5000

Note: output may need manually added affix parameters (SET character_encoding,
TRY suggestion_characters etc., see man(4) hunspell)
EOF
    exit 0
fi

# Upper bound on the number of suffix rules kept after ranking.  The optional
# second command-line argument overrides the default of 5000.
MAXAFFIX=${2:-5000}

# awk interpreter used by the processing stages of this script.  The stages
# expand $AWK unquoted, so the value may carry options (see the profiling
# variant below).
# profiling
#AWK="pgawk --profile"
AWK="awk"
# Prefer gawk when it is installed.  "command -v" is the POSIX way to probe for
# a program; its output is discarded so that the path of gawk is not printed
# on the standard output of this script (the former "which gawk" printed it).
if command -v gawk >/dev/null 2>&1; then
    AWK="gawk"
fi

# Stage 1: collect suffix statistics from the sorted word list ($1) and keep
# the most frequent candidates in affixcompress1.tmp.
#
# Remove output of an earlier run first (the last stage appends to $1.dic).
# The file name is quoted so that paths with whitespace or glob characters
# are handled as a single argument.
rm -f -- "$1.aff" "$1.dic"
# $AWK is left unquoted on purpose: its value may contain options
# (e.g. the commented-out "pgawk --profile" setting).
$AWK '
# Arrays used while scanning the word list (first field of every line):
#   A[word]   words seen in the current group (same first two characters)
#   B[stem]   last character of a seen word whose leading part is "stem"
#   sfx[s]    how often s was the remainder of a word after a seen word
#   sfy[s,c]  how often s was the remainder after a stem that is not a seen
#             word, where c = B[stem] differs from the first character of s
{
    # calculate frequent suffixes
    A[$1] = 1
    len = length($1)
    if (len > 2) {
#        print $1, substr($1, 1, len - 1), substr($1, len, 1) >"/dev/stderr"
        B[substr($1, 1, len - 1)] = substr($1, len, 1);
    }
    # Try every split point: r is the leading part, the rest is the suffix.
    for(i = 2; i < len; i++) {
        r = substr($1, 1, i)
        if (i == 2) {
            # A different two-character prefix starts a new group: drop the
            # data of the previous group and register the current word again.
            # NOTE(review): the B entry stored above for the current word is
            # not registered again after the reset - confirm this is intended.
            if (prev != r) {
                delete A
                delete B
                print "Deleted roots: ", prev > "/dev/stderr"
                A[$1] = 1
            }
            prev = r
        }
        if (A[r]) {
#            print $1 ": " r " and "substr($1, i + 1, len - i + 1) >"/dev/stderr"
            sfx[substr($1, i + 1, len - i + 1)]++
        } else if (B[r] && B[r] != substr($1, i + 1, 1)) {
            r2 = substr($1, i + 1, len - i + 1)
            sfy[r2,B[r]]++
        }
    }
}
END {
    # One line per candidate: "suffix 0 count" for sfx entries and
    # "suffix<SUBSEP>char count" for sfy entries.  The tr command after awk
    # turns the SUBSEP character (octal 034) into a space, which gives both
    # kinds of line three space-separated fields.
    for (i in sfx) print i, 0, sfx[i]
    for (i in sfy) print i, sfy[i]
}
' <"$1" | tr '\034' ' ' >affixcompress0.tmp
# Rank the candidates by count (third field, descending), drop counts below 1
# and keep at most MAXAFFIX lines.  "head -n N" is the standard spelling of
# the obsolescent "head -N".
sort -rnk 3 affixcompress0.tmp | $AWK '$3 >= 1{print $0}' |
head -n "$MAXAFFIX" >affixcompress1.tmp
# Stage 2: choose a root (and optionally a suffix flag) for every word and
# write the affix file.  awk reads the ranked suffix table from stdin ("-")
# first and the word list ($1) second.  The "root [flag]" lines printed on
# stdout go to affixcompress2.tmp; the END rule writes FILENAME ".aff".
# $AWK is left unquoted on purpose: its value may contain options
# (e.g. the commented-out "pgawk --profile" setting).
$AWK '
# Process the words currently stored in W (one class, see the main rule).
#
# Globals read:    W, sfx, sfxfr, sfy, sfyfr, sfz
# Globals written: C (root candidate counters) and A (roots already printed);
#                  both are emptied before returning.  Reading W[...] for a
#                  candidate root creates empty entries in W, which is why
#                  both loops only handle entries whose value is 1.
# Output:          for every word one or more lines "root" or "root flag".
function potential_roots() {
    # potential roots with most frequent suffixes
    for(word in W) if (W[word]==1) {
        len = length(word);
        for(i = 2; i < len; i++) {
            root = substr(word, 1, i)
            suff = substr(word, i + 1, len - i + 1)
            # The leading part is a word of the class and the remainder is a
            # suffix with a count above 100: one more vote for that root.
            if ((W[root]!="") && (sfxfr[suff] > 100)) C[root]++
            if (sfz[suff]) {
                # sfz[suff] lists the characters c for which a rule
                # (suff, c) exists; the candidate root is root followed by c.
                # The count of such a rule is stored under the key
                # [suff, c]; the former lookup sfyfr[root a[k]] used a key
                # that is never written, so this branch could never be taken.
                l = split(sfz[suff], a)
                for (k=1; k <= l; k++) if ((W[root a[k]]!="") && (sfyfr[suff,a[k]] > 100)) {
                    C[root a[k]]++
                }
            }
        }
    }

    # calculate roots
    for(word in W) if (W[word]==1) {
        len = length(word);
        z = 0   # becomes 1 as soon as a line was printed for this word
        # choose most frequent root (maybe the original word)
        max = C[word]
        maxword = word
        maxsuff = 0
        for(i = 2; i < len; i++) {
            root = substr(word, 1, i)
            suff = substr(word, i + 1, len - i + 1)
            if ((sfx[suff] != "") && (C[root] > max)) {
                max = C[root]
                maxword = root
                maxsuff = sfx[suff]
            }
            if (sfz[suff] != "") {
                l = split(sfz[suff], a)
                for (k=1; k <= l; k++) if (C[root a[k]] > max) {
                    max = C[root a[k]]
                    maxword = root a[k]
                    maxsuff = sfy[suff,a[k]]
                }
            }
        }
        if (max > 0) {
            # A counted root was found (possibly the word itself, in which
            # case maxsuff is still 0 and no flag is printed).
            if (maxsuff > 0) print maxword, maxsuff; else print maxword
            A[maxword]++
            z=1
        } else {
            # No counted root: fall back to a root that was already printed
            # for this class and stop at the first match.
            for(i = 2; i < len; i++) {
                root = substr(word, 1, i)
                suff = substr(word, i + 1, len - i + 1)
                if ((A[root] > 0) && sfx[suff]!="") {
                    print root, sfx[suff]
                    z = 1
                    break
                }
                if (sfz[suff]) {
                    l = split(sfz[suff], a)
                    for (k=1; k <= l; k++) if (A[root a[k]]!="") {
                        print root a[k], sfy[suff,a[k]]
                        z = 1
                        break
                    }
                    # The break above only leaves the k loop.  Leave the
                    # split-point loop as well, otherwise the same word may
                    # be printed again for a later split point.
                    if (z == 1) break
                }
            }
        }
        if (z == 0) {
            # Nothing matched: the word is its own root, without flag.
            print word
            A[word]++
        }
    }
    delete A
    delete C
}
# Suffix table (stdin): fields are "suffix 0 count" or "suffix char count".
# The line number NR of an entry is stored as the value of sfx/sfy and is
# later printed as the flag number of the rule.
FILENAME == "-" {
    if ($2 == 0) { 
        sfx[$1] = NR
        sfxfr[$1] = $3
    } else {
        sfy[$1,$2] = NR
        sfyfr[$1,$2] = $3
        sfz[$1] = sfz[$1] " " $2
    }
    maxsuf = NR   # assigned only; not read anywhere in this program
    next
}
# Word list: words are collected per class, i.e. per run of words sharing the
# first three characters.  When the class changes, the finished class is
# processed and W is emptied.
{ 
  cap = substr($1, 1, 3)
  if (cap != prev) {
    potential_roots()
    delete W
    print "Deleted class:", prev > "/dev/stderr"
  }
  prev = cap
  W[$1] = 1
}
END {
    # Process the last class.
    potential_roots()
    # write out frequent suffixes
    out=FILENAME ".aff"
    print "FLAG num" >out
    # Entries with an empty value were only created by lookups; skip them.
    for (i in sfx) if (sfx[i] > 0) {
        print "SFX", sfx[i], "Y 1" >out
        print "SFX", sfx[i], "0", i, "." >out
    }
    for (i in sfy) if (sfy[i] > 0) {
        print "SFX", sfy[i], "Y 1" >out
        # The key is "suffix SUBSEP char"; c[1] is the suffix, c[2] the char.
        split(i, c, "\034");
        print "SFX", sfy[i], c[2], c[1], c[2] >out
    }
}
' - "$1" <affixcompress1.tmp >affixcompress2.tmp
# Stage 3: merge the flags of identical roots and write the dictionary file.
# Sorting numerically on the second field (the flag) first makes the flags of
# a root come in ascending order; lines without a flag sort to the front.
sort -nk 2 affixcompress2.tmp >affixcompress3.tmp
# $AWK is left unquoted on purpose: its value may contain options
# (e.g. the commented-out "pgawk --profile" setting).
$AWK -v out="$1.dic" '
{
    # A[root] collects the comma-separated flags printed for that root.
    if (A[$1]=="") A[$1]=$2;
    else if ($2!="") A[$1] = A[$1] "," $2
}
END { 
    # The first line of the dictionary file is the number of entries.  n is
    # initialised so that an empty input yields "0" and not an empty line.
    n = 0
    for (i in A) n++
    print n >out
    # Close the file now: the count is then written before any entry is sent
    # down the pipe, so the sorted entries appended afterwards by the shell
    # redirection cannot be overwritten by a late flush of this stream.
    close(out)
    for (i in A) {
        if (A[i]=="") print i
        else print i "/" A[i]
    }
}
' <affixcompress3.tmp | sort >>"$1.dic"
